zinbModelFromW <- function(counts, sim_W){
mod <- model.matrix(~ sim_W)
zinb_sim <- zinbFit(counts, ncores = 3, X=mod, which_X_mu=1:3, which_X_pi=1:3, commondispersion=FALSE)
true_alpha_mu <- zinb_sim@beta_mu[-1,]
true_alpha_pi <- zinb_sim@beta_pi[-1,]
true_beta_mu <- zinb_sim@beta_mu[1,,drop=FALSE]
true_beta_pi <- zinb_sim@beta_pi[1,,drop=FALSE]
true_gamma_mu <- zinb_sim@gamma_mu
true_gamma_pi <- zinb_sim@gamma_pi
true_zeta <- zinb_sim@zeta
zinbModel(W=sim_W, alpha_mu=true_alpha_mu, alpha_pi=true_alpha_pi,
beta_mu=true_beta_mu, beta_pi=true_beta_pi,
zeta = true_zeta, gamma_mu=true_gamma_mu,
gamma_pi=true_gamma_pi)
}
simSummary = data.frame(realData = rep('Allen', 3),
K = rep(2, 3),
nclusters = rep(3, 3),
corMuPi = rep('no', 3),
dim = rep('1000x100', 3),
zeroFract = c(.25, .5, .75),
nreplicates = 10)
kable(simSummary)
| realData | K | nclusters | corMuPi | dim | zeroFract | nreplicates |
|---|---|---|---|---|---|---|
| Allen | 2 | 3 | no | 1000x100 | 0.25 | 10 |
| Allen | 2 | 3 | no | 1000x100 | 0.50 | 10 |
| Allen | 2 | 3 | no | 1000x100 | 0.75 | 10 |
To simulate a dataset, steps are as follow
Using this procedure, we have the simulated counts and true \(W\), \(\gamma_{\mu}\), \(\gamma_{\pi}\), \(\alpha_{\mu}\), \(\alpha_{\pi}\), \(\beta_{\mu}\), \(\beta_{\pi}\), and \(\zeta\).
data("allen")
prefilter <- allen[grep("^ERCC-", rownames(allen), invert = TRUE),
which(colData(allen)$Core.Type=="Core")]
filterGenes = apply(assay(prefilter) > 5, 1, sum) >= 5
postfilter <- prefilter[filterGenes, ]
core <- assay(postfilter)
vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]
bioIni = as.factor(colData(postfilter)$driver_1_s)
cl <- as.factor(colData(postfilter)$Primary.Type)
pZero <- rowMeans(log1p(core) == 0)
names(pZero) <- rownames(core)
sum(core == 0)/(nrow(core) * ncol(core))
## [1] 0.339014
The dimensions are
dim(core)
## [1] 1000 285
par(mfrow = c(1,2))
fq <- betweenLaneNormalization(core, which="full")
pca <- prcomp(t(log1p(fq)))
plot(pca$x, col=cols[bioIni], pch=19, main="PCA of log-counts")
#legend("bottomleft", levels(bio), fill=cols)
plot(pca$x, col=cols2[cl], pch=19, main="PCA of log-counts")
Fit data with K = 2, V and X intercepts in both Mu and Pi, commondispersion = FALSE, and no covariate.
print(system.time(zinb <- zinbFit(core, ncores = 3, K = 2,
commondispersion = FALSE)))
## user system elapsed
## 342.893 180.993 207.666
par(mfrow = c(1,1))
# zinb
xlim = c(min(zinb@W[,1]) - .5, max(zinb@W[,1]) + .5)
ylim = c(min(zinb@W[,2]) - .5, max(zinb@W[,2]) + .5)
plot(zinb@W, col = cols[bioIni], xlim = xlim, ylim = ylim,
main = 'Fitted W')
# mclust
mm = Mclust(zinb@W, G = 3)
plot(mm, what = 'classification', xlab = 'W1', ylab = 'W2',
xlim = xlim, ylim = ylim)
# multivar gaussian
sim_W = lapply(mm$classification, function(i){
mvrnorm(n = 1, mu = mm$parameters$mean[, i],
Sigma = mm$parameters$variance$sigma[,, i])
})
sim_W = do.call(rbind, sim_W)
bio = mm$classification
plot(sim_W, col = bio,
main = 'true W (multivar gauss sim)',
xlab = 'W1', ylab = 'W2', xlim = xlim, ylim = ylim)
core = assay(postfilter)
pZero <- rowMeans(log1p(core) == 0)
names(pZero) = rownames(core)
chosenGenes = c(sample(names(pZero)[pZero > 0.75], 100),
sample(names(pZero)[pZero<0.75 & pZero>0.6],300),
sample(names(pZero)[pZero<0.6 & pZero>0.2], 500),
sample(names(pZero)[pZero<0.2], 100))
core50 = core[chosenGenes, ]
sum(core50 == 0)/(nrow(core50) * ncol(core50))
## [1] 0.4931895
simModel = simModel50 = zinbModelFromW(core50, sim_W)
simData = simData50 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core50>0)
coverage <- colSums(core50)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
gamma_pi = simModel@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=bio)
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.2833412 0.4483783 0.9416495
## gamma_pi -0.2833412 1.0000000 -0.9542971 -0.3960176
## detection_rate 0.4483783 -0.9542971 1.0000000 0.5218290
## coverage 0.9416495 -0.3960176 0.5218290 1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
alpha_mu_2 = simModel@alpha_mu[2, ],
alpha_pi_1 = simModel@alpha_pi[1, ],
alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.00000000 0.00847088 -0.12261899 0.1081034
## alpha_mu_2 0.00847088 1.00000000 0.01868779 -0.1116836
## alpha_pi_1 -0.12261899 0.01868779 1.00000000 0.1517242
## alpha_pi_2 0.10810337 -0.11168357 0.15172423 1.0000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.1067677
## beta_pi -0.1067677 1.0000000
chosenGenes = c(sample(names(pZero)[pZero > 0.4], 110),
sample(names(pZero)[pZero<0.4 & pZero>0.2], 490),
sample(names(pZero)[pZero<0.2], 400))
core25 <- core[chosenGenes, ]
sum(core25 == 0)/(nrow(core25) * ncol(core25))
## [1] 0.2494596
simModel = simModel25 = zinbModelFromW(core25, sim_W)
simData = simData25 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core25>0)
coverage <- colSums(core25)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
gamma_pi = simModel@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=bio)
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.3680141 0.5131362 0.9357462
## gamma_pi -0.3680141 1.0000000 -0.9523503 -0.3982634
## detection_rate 0.5131362 -0.9523503 1.0000000 0.5071295
## coverage 0.9357462 -0.3982634 0.5071295 1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
alpha_mu_2 = simModel@alpha_mu[2, ],
alpha_pi_1 = simModel@alpha_pi[1, ],
alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.00000000 -0.05371179 -0.19891845 0.1373080
## alpha_mu_2 -0.05371179 1.00000000 0.04851486 -0.2007947
## alpha_pi_1 -0.19891845 0.04851486 1.00000000 0.1900043
## alpha_pi_2 0.13730799 -0.20079465 0.19000434 1.0000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.2206999
## beta_pi -0.2206999 1.0000000
chosenGenes = c(sample(names(pZero)[pZero > 0.8], 700),
sample(names(pZero)[pZero < 0.8 & pZero > 0.3], 200),
sample(names(pZero)[pZero < 0.3], 100))
core75 <- core[chosenGenes, ]
sum(rowSums(core75) == 0)
## [1] 0
core75 = core75[rowSums(core75) != 0, ]
sum(core75 == 0)/(nrow(core75) * ncol(core75))
## [1] 0.7567018
simModel = simModel75 = zinbModelFromW(core75, sim_W)
simData = simData75 = zinbSim(simModel, seed = 8746)
detection_rate <- colSums(core75>0)
coverage <- colSums(core75)
df <- data.frame(gamma_mu = simModel@gamma_mu[1, ],
gamma_pi = simModel@gamma_pi[1, ],
detection_rate = detection_rate,
coverage = coverage)
pairs(df, pch=19, col=bio)
print(cor(df, method="spearman"))
## gamma_mu gamma_pi detection_rate coverage
## gamma_mu 1.0000000 -0.2657099 0.4290702 0.9109961
## gamma_pi -0.2657099 1.0000000 -0.9561455 -0.4233761
## detection_rate 0.4290702 -0.9561455 1.0000000 0.5332160
## coverage 0.9109961 -0.4233761 0.5332160 1.0000000
df <- data.frame(alpha_mu_1 = simModel@alpha_mu[1, ],
alpha_mu_2 = simModel@alpha_mu[2, ],
alpha_pi_1 = simModel@alpha_pi[1, ],
alpha_pi_2 = simModel@alpha_pi[2, ])
pairs(df, pch=19, main = 'alpha')
print(cor(df, method="spearman"))
## alpha_mu_1 alpha_mu_2 alpha_pi_1 alpha_pi_2
## alpha_mu_1 1.0000000 0.1159766 0.12211908 0.14437814
## alpha_mu_2 0.1159766 1.0000000 0.07254640 0.13098009
## alpha_pi_1 0.1221191 0.0725464 1.00000000 0.07101794
## alpha_pi_2 0.1443781 0.1309801 0.07101794 1.00000000
df <- data.frame(beta_mu = simModel@beta_mu[1, ],
beta_pi = simModel@beta_pi[1, ])
pairs(df, pch=19, main = 'Beta')
print(cor(df, method="spearman"))
## beta_mu beta_pi
## beta_mu 1.0000000 -0.1395131
## beta_pi -0.1395131 1.0000000
core = core50
simData = simData50
simModel = simModel50
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)
par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
xlab="Average simulated Pi", ylab="True Detection Rate",
pch=19, col=bio, ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=bio,ylim = c(9,14))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
core = core25
simData = simData25
simModel = simModel25
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)
par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
xlab="Average simulated Pi", ylab="True Detection Rate",
pch=19, col=bio, ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=bio,ylim = c(9,14))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
simData = simData75
simModel = simModel75
core = core75
sim_detection_rate <- rowMeans(simData$counts>0)
sim_coverage <- rowSums(simData$counts)
par(mfrow=c(1, 2))
plot(rowMeans(getPi(simModel)), sim_detection_rate,
xlab = "Average simulated Pi", ylab = "True Detection Rate",
pch = 19, col = bio, ylim = c(0, 1), xlim = c(0,1))
plot(rowMeans(log1p(getMu(simModel))), log1p(sim_coverage), xlim = c(4, 7),
xlab="Average simulated log Mu", ylab="log Coverage", pch=19,
col=bio,ylim = c(9,14))
pal =colorRampPalette(c("black","black", "red","yellow"), space="rgb")
mean = apply(log1p(core), 1, function(x) mean(x[x!=0]))
par(mfrow=c(1, 2))
smoothScatter(mean, rowMeans(core == 0),xlim = c(3,8),
nrpoints=Inf, pch = "", cex = .7, ylim = c(0,1),
xlab = "Mean Expression", ylab = "Dropout Rate",
colramp = pal, main = 'Observed')
smoothScatter(colMeans(log1p(getMu(simModel))), colMeans(getPi(simModel)), nbin = 256,
nrpoints=Inf, pch="", cex=.7, xlim = c(3,8),
xlab = "Mean Expression", main = 'Simulated',
ylab = "Dropout Rate",ylim = c(0,1),
colramp = pal)
B = 10
bio = as.vector(bio)
originalCounts = core50
simModel = simModel50
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen5.rda")
zero5 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core25
simModel = simModel25
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData, file = "./datasets/sim_allen25.rda")
zero25 = sapply(simData, function(x) x$zeroFraction)
originalCounts = core75
simModel = simModel75
simData = lapply(seq_len(B), function(i) zinbSim(simModel, seed=i))
save(originalCounts, bio, simModel, simData,
file = "./datasets/sim_allen75.rda")
zero75 = sapply(simData, function(x) x$zeroFraction)
zeroFrac = data.frame(zero = c(zero5, zero25, zero75),
perc = c(rep(.5, 10), rep(.25, 10), rep(.75, 10)))
zeroFrac$perc = factor(zeroFrac$perc)
zeroFrac = melt(zeroFrac)
ggplot(zeroFrac, aes(x = perc, y = value)) + geom_boxplot() +
ylab('zero fraction') + xlab('wanted zero fraction') + ggtitle('one boxplot = 10 replicates')